{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#encoding=utf8\n",
      "import json\n",
      "import sys\n",
      "\n",
      "import bson\n",
      "import pymongo\n",
      "\n",
      "# mysimhash is imported from the parent directory, so extend the import path first.\n",
      "sys.path.append('..')\n",
      "import mysimhash\n",
      "\n",
      "# Tunables for this experiment.\n",
      "INDEX_LIMIT = 10000  # number of articles loaded into the simhash index\n",
      "BUILD_K = 50         # k passed to SimhashIndex when the index is created\n",
      "QUERY_K = 13         # k assigned to the index before querying\n",
      "SAMPLE_SIZE = 10     # number of articles to look up near-duplicates for\n",
      "\n",
      "# MongoClient replaces the deprecated pymongo.Connection (removed in PyMongo 3).\n",
      "Article = pymongo.MongoClient().stock.Article\n",
      "\n",
      "\n",
      "def pr(t):\n",
      "    \"\"\"Pretty-print t as indented JSON, keeping non-ASCII characters readable.\"\"\"\n",
      "    print json.dumps(t, ensure_ascii=False, indent=2)\n",
      "\n",
      "\n",
      "# Build the index from the stored 'sim' value of each article.\n",
      "bu = mysimhash.SimhashIndex({}, k=BUILD_K)\n",
      "for doc in Article.find({}, {'_id': 1, 'sim': 1}).limit(INDEX_LIMIT):\n",
      "    bu.add(doc['_id'], mysimhash.Simhash(int(doc['sim'])))\n",
      "\n",
      "# NOTE(review): k is changed after the index is built; confirm mysimhash supports this.\n",
      "bu.k = QUERY_K\n",
      "\n",
      "# For a few articles, print the article followed by its near-duplicates.\n",
      "for item in Article.find().limit(SAMPLE_SIZE):\n",
      "    obj_id = str(item['_id'])\n",
      "    sim = mysimhash.Simhash(int(item['sim']))\n",
      "    print item['title'], item['_id']\n",
      "    for dup in bu.get_near_dups(sim):\n",
      "        # dup[1] is compared with the query article's id string to skip the article itself.\n",
      "        if dup[1] == obj_id:\n",
      "            continue\n",
      "        art = Article.find_one({'_id': bson.objectid.ObjectId(dup[1])})\n",
      "        if art is None:\n",
      "            # No matching document was found, so there is nothing to print.\n",
      "            continue\n",
      "        print '\\t', art['title'], art['_id'], art['url']\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(5/15)\u5341\u5927\u535a\u6587\u7cbe\u7cb9\uff1a\u4e09\u8fde\u9634\u540e\u5c06\u5982\u4f55\u6f14\u53d8\uff1f 5374c1267f949f34902c46f1\n",
        "\t\u9884\u9632\u5f0f\u62c9\u5347\u6210\u5e38\u6001 \u8010\u5fc3\u4f4e\u5438\u4e00\u7c7b\u80a1 5375bc577f949f4338bf4f0f http://stock.eastmoney.com/news/1405,20140516385046727.html\n",
        "\t\u5b9d\u5229\u6765\u201c\u8c6a\u8d4c\u201d\u624b\u6e38 \u80a1\u4ef7\u63a5\u8fd1\u6da8\u505c\u5f00\u76d8 5374c1807f949f34902c4803 http://stock.eastmoney.com/news/1406,20140515384587670.html\n",
        "\t\u822a\u5929\u519b\u5de5\u677f\u5757\u5f3a\u52bf\u62c9\u5347 \u535a\u4e91\u65b0\u6750\u6da8\u505c 5375bc897f949f4338bf4f86 http://stock.eastmoney.com/news/1406,20140516384976283.html\n",
        "\t\u5168\u7403\u589e\u957f\u62c5\u5fe7\u65bd\u538b \u6b27\u80a1\u6307\u6570\u5468\u56db\u8dcc0.91% 5375bc767f949f4338bf4f42 http://stock.eastmoney.com/news/1438,20140516384815028.html\n",
        "\t\u56fd\u7f51\u516b\u5927\u7279\u9ad8\u538b\u83b7\u51c6 12\u80a1\u76db\u5bb4\u5f00\u5e2d 5374c1757f949f34902c47da http://stock.eastmoney.com/news/1406,20140515384486198.html\n",
        "\u8010\u5fc3\u4f4e\u5438\u4e09\u7c7b\u80a1 5374c12c7f949f34902c46fd\n",
        "\t\u63a2\u8def\u8005\uff1a\u7ecf\u8425\u826f\u597d \u6218\u7565\u63a8\u8fdb 5375bccf7f949f4338bf5069 http://stock.eastmoney.com/news/1417,20140516385022467.html\n",
        "\t\u878d\u521b\u62df\u6536\u8d2d\u7eff\u57ce\u4e0d\u8d85\u8fc730%\u80a1\u4efd 5374c1897f949f34902c4826 http://stock.eastmoney.com/news/1532,20140515384754861.html\n",
        "\t\u5de5\u4fe1\u90e8\u63a8\u52a8\u8282\u6c34\u88c5\u5907\u4ea7\u4e1a\u5316 \u52a0\u5f3a\u5de5\u4e1a\u7528\u6c34\u8282\u7ea6 5374c13e7f949f34902c4717 http://stock.eastmoney.com/news/1406,20140515384740122.html\n",
        "\t\u5e7f\u897f\u6d77\u4e0a\u4e1d\u7ef8\u4e4b\u8def\u91cd\u70b9\u63a8\u8fdb8\u5927\u4ea7\u4e1a\u5408\u4f5c 5374c15d7f949f34902c4781 http://stock.eastmoney.com/news/1406,20140515384604918.html\n",
        "\t\u53d1\u6539\u59d4\u5b9e\u65bd\u5883\u5916\u6295\u8d44\u9879\u76ee\u6838\u51c6\u548c\u5907\u6848\u7ba1\u7406\u529e\u6cd5 5375bc8d7f949f4338bf4f93 http://stock.eastmoney.com/news/1406,20140516384983129.html\n",
        "\u4e2d\u56fd\u9a7b\u8d8a\u5357\u5927\u4f7f\u9986\u518d\u6b21\u63d0\u9192\u516c\u6c11\u6ce8\u610f\u4eba\u8eab\u8d22\u4ea7\u5b89\u5168 5374c1397f949f34902c4709\n",
        "\t\u9884\u9632\u5f0f\u62c9\u5347\u6210\u5e38\u6001 \u8010\u5fc3\u4f4e\u5438\u4e00\u7c7b\u80a1 5375bc577f949f4338bf4f0f http://stock.eastmoney.com/news/1405,20140516385046727.html\n",
        "\t\u5b9d\u5229\u6765\u201c\u8c6a\u8d4c\u201d\u624b\u6e38 \u80a1\u4ef7\u63a5\u8fd1\u6da8\u505c\u5f00\u76d8 5374c1807f949f34902c4803 http://stock.eastmoney.com/news/1406,20140515384587670.html\n",
        "\t\u822a\u5929\u519b\u5de5\u677f\u5757\u5f3a\u52bf\u62c9\u5347 \u535a\u4e91\u65b0\u6750\u6da8\u505c 5375bc897f949f4338bf4f86 http://stock.eastmoney.com/news/1406,20140516384976283.html\n",
        "\t\u56fd\u7f51\u516b\u5927\u7279\u9ad8\u538b\u83b7\u51c6 12\u80a1\u76db\u5bb4\u5f00\u5e2d 5374c1757f949f34902c47da http://stock.eastmoney.com/news/1406,20140515384486198.html\n",
        "\t\u4e0a\u5e02\u94f6\u884c\u9ad8\u7ba1\u9996\u6b21\u96c6\u4f53\u8d2d\u5165\u81ea\u5bb6\u80a1\u7968 5374c1a37f949f34902c4881 http://stock.eastmoney.com/news/1349,20140515384453398.html\n",
        "\u6210\u4ea4\u91cf\u840e\u7f29\u7591\u8bf1\u7a7a \u6295\u8d44\u8005\u9700\u8c28\u614e 5374c14d7f949f34902c4748\n",
        "\t\u9884\u9632\u5f0f\u62c9\u5347\u6210\u5e38\u6001 \u8010\u5fc3\u4f4e\u5438\u4e00\u7c7b\u80a1 5375bc577f949f4338bf4f0f http://stock.eastmoney.com/news/1405,20140516385046727.html\n",
        "\t\u5b9d\u5229\u6765\u201c\u8c6a\u8d4c\u201d\u624b\u6e38 \u80a1\u4ef7\u63a5\u8fd1\u6da8\u505c\u5f00\u76d8 5374c1807f949f34902c4803 http://stock.eastmoney.com/news/1406,20140515384587670.html\n",
        "\t\u822a\u5929\u519b\u5de5\u677f\u5757\u5f3a\u52bf\u62c9\u5347 \u535a\u4e91\u65b0\u6750\u6da8\u505c 5375bc897f949f4338bf4f86 http://stock.eastmoney.com/news/1406,20140516384976283.html\n",
        "\t\u5168\u7403\u589e\u957f\u62c5\u5fe7\u65bd\u538b \u6b27\u80a1\u6307\u6570\u5468\u56db\u8dcc0.91% 5375bc767f949f4338bf4f42 http://stock.eastmoney.com/news/1438,20140516384815028.html\n",
        "\t\u56fd\u7f51\u516b\u5927\u7279\u9ad8\u538b\u83b7\u51c6 12\u80a1\u76db\u5bb4\u5f00\u5e2d 5374c1757f949f34902c47da http://stock.eastmoney.com/news/1406,20140515384486198.html\n",
        "\u5f20\u88d5B\u88ab\u5254\u9664MSCI\u4e2d\u56fd\u6307\u6570 5374c1617f949f34902c4791\n",
        "\t\u878d\u521b\u62df\u6536\u8d2d\u7eff\u57ce\u4e0d\u8d85\u8fc730%\u80a1\u4efd 5374c1897f949f34902c4826 http://stock.eastmoney.com/news/1532,20140515384754861.html\n",
        "\t\u6caa\u6df1B\u6307\u53cc\u53cc\u6536\u8dcc \u6caa\u5e02B\u6307\u8dcc\u5e45\u4e3a0.32% 5374c18a7f949f34902c4827 http://stock.eastmoney.com/news/1432,20140515384726288.html\n",
        "\t\u65b0\u56fd\u4e5d\u6761\u4f20\u9012\u8d85\u80fd\u91cf \u4e09\u5927\u671f\u4ea4\u6240\u7ed8\u672a\u6765\u84dd\u56fe 5375bc947f949f4338bf4fae http://stock.eastmoney.com/news/1406,20140516384844550.html\n",
        "\t\u7535\u5546\u4e1a\u7ee9\u5206\u5316 \u552f\u54c1\u4f1a\u5927\u6da8\u5f53\u5f53\u7f51\u66b4\u8dcc 5375bc9f7f949f4338bf4fd8 http://stock.eastmoney.com/news/1406,20140516384947650.html\n",
        "\t\u4e3b\u677f\u5fae\u6da8\u521b\u4e1a\u677f\u66b4\u8dcc \u5e02\u573a\u6025\u76fc\u8d44\u91d1\u6d3b\u6c34 5375bcbb7f949f4338bf5040 http://stock.eastmoney.com/news/1407,20140516385054571.html\n",
        "\u53ca\u65f6\u9501\u5b9a\u90e8\u5206\u76c8\u5229 5374c12c7f949f34902c46fb\n",
        "\t\u878d\u521b\u62df\u6536\u8d2d\u7eff\u57ce\u4e0d\u8d85\u8fc730%\u80a1\u4efd 5374c1897f949f34902c4826 http://stock.eastmoney.com/news/1532,20140515384754861.html\n",
        "\t\u6caa\u6df1B\u6307\u53cc\u53cc\u6536\u8dcc \u6caa\u5e02B\u6307\u8dcc\u5e45\u4e3a0.32% 5374c18a7f949f34902c4827 http://stock.eastmoney.com/news/1432,20140515384726288.html\n",
        "\t\u65b0\u56fd\u4e5d\u6761\u4f20\u9012\u8d85\u80fd\u91cf \u4e09\u5927\u671f\u4ea4\u6240\u7ed8\u672a\u6765\u84dd\u56fe 5375bc947f949f4338bf4fae http://stock.eastmoney.com/news/1406,20140516384844550.html\n",
        "\t\u7535\u5546\u4e1a\u7ee9\u5206\u5316 \u552f\u54c1\u4f1a\u5927\u6da8\u5f53\u5f53\u7f51\u66b4\u8dcc 5375bc9f7f949f4338bf4fd8 http://stock.eastmoney.com/news/1406,20140516384947650.html\n",
        "\t\u4e3b\u677f\u5fae\u6da8\u521b\u4e1a\u677f\u66b4\u8dcc \u5e02\u573a\u6025\u76fc\u8d44\u91d1\u6d3b\u6c34 5375bcbb7f949f4338bf5040 http://stock.eastmoney.com/news/1407,20140516385054571.html\n",
        "\u5173\u6ce8\u53d7\u76ca\u653f\u7b56\u7684\u7ee9\u4f18\u80a1"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 5374c1317f949f34902c4701\n",
        "\t\u9884\u9632\u5f0f\u62c9\u5347\u6210\u5e38\u6001 \u8010\u5fc3\u4f4e\u5438\u4e00\u7c7b\u80a1 5375bc577f949f4338bf4f0f http://stock.eastmoney.com/news/1405,20140516385046727.html\n",
        "\t\u5b9d\u5229\u6765\u201c\u8c6a\u8d4c\u201d\u624b\u6e38 \u80a1\u4ef7\u63a5\u8fd1\u6da8\u505c\u5f00\u76d8 5374c1807f949f34902c4803 http://stock.eastmoney.com/news/1406,20140515384587670.html\n",
        "\t\u822a\u5929\u519b\u5de5\u677f\u5757\u5f3a\u52bf\u62c9\u5347 \u535a\u4e91\u65b0\u6750\u6da8\u505c 5375bc897f949f4338bf4f86 http://stock.eastmoney.com/news/1406,20140516384976283.html\n",
        "\t\u5168\u7403\u589e\u957f\u62c5\u5fe7\u65bd\u538b \u6b27\u80a1\u6307\u6570\u5468\u56db\u8dcc0.91% 5375bc767f949f4338bf4f42 http://stock.eastmoney.com/news/1438,20140516384815028.html\n",
        "\t\u56fd\u7f51\u516b\u5927\u7279\u9ad8\u538b\u83b7\u51c6 12\u80a1\u76db\u5bb4\u5f00\u5e2d 5374c1757f949f34902c47da http://stock.eastmoney.com/news/1406,20140515384486198.html\n",
        "\u7f8e\u8d22\u4ea7\u516c\u5f00\u6cd5\u6216\u5f15\u53d1\u201c\u9000\u7c4d\u6f6e\u201d 5374c1337f949f34902c4704\n",
        "\t\u9884\u9632\u5f0f\u62c9\u5347\u6210\u5e38\u6001 \u8010\u5fc3\u4f4e\u5438\u4e00\u7c7b\u80a1 5375bc577f949f4338bf4f0f http://stock.eastmoney.com/news/1405,20140516385046727.html\n",
        "\t\u5b9d\u5229\u6765\u201c\u8c6a\u8d4c\u201d\u624b\u6e38 \u80a1\u4ef7\u63a5\u8fd1\u6da8\u505c\u5f00\u76d8 5374c1807f949f34902c4803 http://stock.eastmoney.com/news/1406,20140515384587670.html\n",
        "\t\u822a\u5929\u519b\u5de5\u677f\u5757\u5f3a\u52bf\u62c9\u5347 \u535a\u4e91\u65b0\u6750\u6da8\u505c 5375bc897f949f4338bf4f86 http://stock.eastmoney.com/news/1406,20140516384976283.html\n",
        "\t\u5168\u7403\u589e\u957f\u62c5\u5fe7\u65bd\u538b \u6b27\u80a1\u6307\u6570\u5468\u56db\u8dcc0.91% 5375bc767f949f4338bf4f42 http://stock.eastmoney.com/news/1438,20140516384815028.html\n",
        "\t\u56fd\u7f51\u516b\u5927\u7279\u9ad8\u538b\u83b7\u51c6 12\u80a1\u76db\u5bb4\u5f00\u5e2d 5374c1757f949f34902c47da http://stock.eastmoney.com/news/1406,20140515384486198.html\n",
        "\u5468\u56db\u7968\u636e\u8f6c\u8d34\u5229\u7387\u548c\u7968\u636e\u76f4\u8d34\u5229\u7387\u4f01\u7a33\u53cd\u5f39 5374c1397f949f34902c4708\n",
        "\u65b0\u201c\u56fd\u4e5d\u6761\u201d\u7684\u6838\u5fc3\u662f\u79ef\u6781\u7a33\u59a5\u63a8\u52a8\u6ce8\u518c\u5236\u6539\u9769 5374c13f7f949f34902c471b\n",
        "\t\u9884\u9632\u5f0f\u62c9\u5347\u6210\u5e38\u6001 \u8010\u5fc3\u4f4e\u5438\u4e00\u7c7b\u80a1 5375bc577f949f4338bf4f0f http://stock.eastmoney.com/news/1405,20140516385046727.html\n",
        "\t\u5b9d\u5229\u6765\u201c\u8c6a\u8d4c\u201d\u624b\u6e38 \u80a1\u4ef7\u63a5\u8fd1\u6da8\u505c\u5f00\u76d8 5374c1807f949f34902c4803 http://stock.eastmoney.com/news/1406,20140515384587670.html\n",
        "\t\u822a\u5929\u519b\u5de5\u677f\u5757\u5f3a\u52bf\u62c9\u5347 \u535a\u4e91\u65b0\u6750\u6da8\u505c 5375bc897f949f4338bf4f86 http://stock.eastmoney.com/news/1406,20140516384976283.html\n",
        "\t\u56fd\u7f51\u516b\u5927\u7279\u9ad8\u538b\u83b7\u51c6 12\u80a1\u76db\u5bb4\u5f00\u5e2d 5374c1757f949f34902c47da http://stock.eastmoney.com/news/1406,20140515384486198.html\n",
        "\t\u4e0a\u5e02\u94f6\u884c\u9ad8\u7ba1\u9996\u6b21\u96c6\u4f53\u8d2d\u5165\u81ea\u5bb6\u80a1\u7968 5374c1a37f949f34902c4881 http://stock.eastmoney.com/news/1349,20140515384453398.html\n"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}